{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluations\n",
    "\n",
    "This notebook shows how to pull traces from a running phoenix instance and evaluate them using the `arize-phoenix-evals` library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install \"arize-phoenix[evals]\" openai nest_asyncio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run async evaluation in the notebook\n",
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import phoenix as px\n",
    "\n",
    "client = px.Client(endpoint=\"http://localhost:6006\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from phoenix.trace.dsl.helpers import get_qa_with_reference, get_retrieved_documents\n",
    "\n",
    "qa_df = get_qa_with_reference(client)\n",
    "documents_df = get_retrieved_documents(client)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "qa_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "documents_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Evaluate Retrieval\n",
    "\n",
    "from phoenix.evals import (\n",
    "    OpenAIModel,\n",
    "    RelevanceEvaluator,\n",
    "    run_evals,\n",
    ")\n",
    "\n",
    "relevance_evaluator = RelevanceEvaluator(OpenAIModel(model=\"gpt-4-turbo-preview\"))\n",
    "\n",
    "relevance_evals = run_evals(\n",
    "    evaluators=[relevance_evaluator],\n",
    "    dataframe=documents_df,\n",
    "    provide_explanation=True,\n",
    "    concurrency=20,\n",
    ")[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "relevance_evals.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Evaluate Responses\n",
    "\n",
    "from phoenix.evals import (\n",
    "    HallucinationEvaluator,\n",
    "    OpenAIModel,\n",
    "    QAEvaluator,\n",
    "    run_evals,\n",
    ")\n",
    "\n",
    "qa_evaluator = QAEvaluator(OpenAIModel(model=\"gpt-4-turbo-preview\"))\n",
    "hallucination_evaluator = HallucinationEvaluator(OpenAIModel(model=\"gpt-4-turbo-preview\"))\n",
    "\n",
    "qa_evals, hallucination_evals = run_evals(\n",
    "    evaluators=[qa_evaluator, hallucination_evaluator],\n",
    "    dataframe=qa_df,\n",
    "    provide_explanation=True,\n",
    "    concurrency=20,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from phoenix.trace import DocumentEvaluations, SpanEvaluations\n",
    "\n",
    "# Log the evaluations back to\n",
    "client.log_evaluations(\n",
    "    DocumentEvaluations(dataframe=relevance_evals, eval_name=\"relevance\"),\n",
    "    SpanEvaluations(dataframe=qa_evals, eval_name=\"qa\"),\n",
    "    SpanEvaluations(dataframe=hallucination_evals, eval_name=\"hallucination\"),\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dspy",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
